Prediction using Supervised ML
Graduate Rotational Internship Program The Sparks Foundation
Data Science & Business Analytics Tasks
Prediction using Supervised ML
Introduction
Linear Regression with R Programming (R-studio)
In this project the we can see how a Simple Linear Regression can be used for prediction for the percentage of an student based on the no. of study hours.
It is implementing using R programming -rmarkdown report.
We will start with simple linear regression involving two variables.
Problem Statement
- In this we need to predict the percentage of marks based on the number of hours they study.
- Here we are using two variables Hours and Scores for the regression.
Data Collection
- Collecting the student data from url
- Reading the data into csv file
library(readr)
library(tidyverse)
library(tidymodels)
library(ggplot2)
student_scoresURL <- "https://raw.githubusercontent.com/AdiPersonalWorks/Random/master/student_scores%20-%20student_scores.csv"
student_scores <- read.csv(file = student_scoresURL,
header = TRUE,
sep = ",",
stringsAsFactors = FALSE)
student_scores## Hours Scores
## 1 2.5 21
## 2 5.1 47
## 3 3.2 27
## 4 8.5 75
## 5 3.5 30
## 6 1.5 20
## 7 9.2 88
## 8 5.5 60
## 9 8.3 81
## 10 2.7 25
## 11 7.7 85
## 12 5.9 62
## 13 4.5 41
## 14 3.3 42
## 15 1.1 17
## 16 8.9 95
## 17 2.5 30
## 18 1.9 24
## 19 6.1 67
## 20 7.4 69
## 21 2.7 30
## 22 4.8 54
## 23 3.8 35
## 24 6.9 76
## 25 7.8 86
- writing/saving the student_scores file into local directory
write.table(student_scores, file = "student_scores.csv ",
row.names = FALSE,
col.names = TRUE,
sep = "\t",
quote = FALSE)- Checking Basic Information about the student_scores data
- Checking summary and structure of the data
head(student_scores)## Hours Scores
## 1 2.5 21
## 2 5.1 47
## 3 3.2 27
## 4 8.5 75
## 5 3.5 30
## 6 1.5 20
dim(student_scores)## [1] 25 2
str(student_scores) ## 'data.frame': 25 obs. of 2 variables:
## $ Hours : num 2.5 5.1 3.2 8.5 3.5 1.5 9.2 5.5 8.3 2.7 ...
## $ Scores: int 21 47 27 75 30 20 88 60 81 25 ...
summary(student_scores) ## Hours Scores
## Min. :1.100 Min. :17.00
## 1st Qu.:2.700 1st Qu.:30.00
## Median :4.800 Median :47.00
## Mean :5.012 Mean :51.48
## 3rd Qu.:7.400 3rd Qu.:75.00
## Max. :9.200 Max. :95.00
Data Preparation
library(caTools)
set <- sample(2, nrow(student_scores),
replace = TRUE,
prob = c(0.7, 0.3))
train <- student_scores[set==1,]
train## Hours Scores
## 1 2.5 21
## 2 5.1 47
## 3 3.2 27
## 4 8.5 75
## 5 3.5 30
## 6 1.5 20
## 7 9.2 88
## 8 5.5 60
## 9 8.3 81
## 10 2.7 25
## 11 7.7 85
## 12 5.9 62
## 13 4.5 41
## 14 3.3 42
## 16 8.9 95
## 17 2.5 30
## 18 1.9 24
## 19 6.1 67
## 20 7.4 69
## 21 2.7 30
## 24 6.9 76
## 25 7.8 86
test <- student_scores[set==2,]
test## Hours Scores
## 15 1.1 17
## 22 4.8 54
## 23 3.8 35
Data Cleaning
- Checking for Missing Values/NA/NAN
library(DataExplorer)
sum(is.na(train))## [1] 0
sum(is.na(test))## [1] 0
plot_missing(train) # Exploratory Data Analysis (EDA) * describe - can computes the statistics of all numerical variables
library(Hmisc)
describe(train)## train
##
## 2 Variables 22 Observations
## --------------------------------------------------------------------------------
## Hours
## n missing distinct Info Mean Gmd .05 .10
## 22 0 20 0.999 5.255 2.968 1.930 2.500
## .25 .50 .75 .90 .95
## 2.825 5.300 7.625 8.480 8.880
##
## lowest : 1.5 1.9 2.5 2.7 3.2, highest: 7.8 8.3 8.5 8.9 9.2
##
## Value 1.5 1.9 2.5 2.7 3.2 3.3 3.5 4.5 5.1 5.5 5.9
## Frequency 1 1 2 2 1 1 1 1 1 1 1
## Proportion 0.045 0.045 0.091 0.091 0.045 0.045 0.045 0.045 0.045 0.045 0.045
##
## Value 6.1 6.9 7.4 7.7 7.8 8.3 8.5 8.9 9.2
## Frequency 1 1 1 1 1 1 1 1 1
## Proportion 0.045 0.045 0.045 0.045 0.045 0.045 0.045 0.045 0.045
## --------------------------------------------------------------------------------
## Scores
## n missing distinct Info Mean Gmd .05 .10
## 22 0 20 0.998 53.68 29.83 21.15 24.10
## .25 .50 .75 .90 .95
## 30.00 53.50 75.75 85.90 87.90
##
## lowest : 20 21 24 25 27, highest: 81 85 86 88 95
##
## Value 20 21 24 25 27 30 41 42 47 60 62
## Frequency 1 1 1 1 1 3 1 1 1 1 1
## Proportion 0.045 0.045 0.045 0.045 0.045 0.136 0.045 0.045 0.045 0.045 0.045
##
## Value 67 69 75 76 81 85 86 88 95
## Frequency 1 1 1 1 1 1 1 1 1
## Proportion 0.045 0.045 0.045 0.045 0.045 0.045 0.045 0.045 0.045
## --------------------------------------------------------------------------------
describe(test)## test
##
## 2 Variables 3 Observations
## --------------------------------------------------------------------------------
## Hours
## n missing distinct Info Mean Gmd
## 3 0 3 1 3.233 2.467
##
## Value 1.1 3.8 4.8
## Frequency 1 1 1
## Proportion 0.333 0.333 0.333
## --------------------------------------------------------------------------------
## Scores
## n missing distinct Info Mean Gmd
## 3 0 3 1 35.33 24.67
##
## Value 17 35 54
## Frequency 1 1 1
## Proportion 0.333 0.333 0.333
## --------------------------------------------------------------------------------
- Two continuous variables
library(ggplot2)
q <- ggplot(data = train, aes(x = Hours, y = Scores))+
geom_line(colour = "skyblue") +
geom_point(colour = "blue")
qggplot(data=train,aes(x=Hours,y=Scores)) +
geom_bar(stat ='identity',aes(fill=Scores))+
coord_flip() +
theme_grey() +
scale_fill_gradient(name="Score Level")+
labs(title = 'Scores according to hours',
y='Score per hour',x='Hours of study')+
geom_hline(yintercept = mean(student_scores$Scores),size = 1, color = 'red')library(plotly)
plot_ly(train, x = ~Hours, y = ~Scores, type = 'bar', mode = 'markers',marker = list(color = "DARKGREEN", opacity = 0.5), size = 4) %>%
layout(title = 'Scores according to hours',
yaxis = list(title = 'Scores'),
xaxis = list(title = 'Hours of study') )Modelling
- Simple Linear Regression
model <- lm( Scores~ Hours, data=train) #model building
model##
## Call:
## lm(formula = Scores ~ Hours, data = train)
##
## Coefficients:
## (Intercept) Hours
## 1.887 9.857
- Summary of model
summary(model) ##
## Call:
## lm(formula = Scores ~ Hours, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.673 -5.223 1.727 4.713 7.585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.887 2.881 0.655 0.52
## Hours 9.857 0.496 19.873 1.22e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.761 on 20 degrees of freedom
## Multiple R-squared: 0.9518, Adjusted R-squared: 0.9494
## F-statistic: 394.9 on 1 and 20 DF, p-value: 1.219e-14
#plot
plot(Scores~ Hours, train, pch =4, frame = FALSE, col = "red")
abline(model, col = "blue")Model Prediction
# Prediction
cof <- coef(summary(model))
cof## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.886629 2.8812274 0.6548003 5.200567e-01
## Hours 9.857216 0.4960066 19.8731554 1.219422e-14
pred <- predict(model,train)
pred## 1 2 3 4 5 6 7 8
## 26.52967 52.15843 33.42972 85.67296 36.38688 16.67245 92.57302 56.10132
## 9 10 11 12 13 14 16 17
## 83.70152 28.50111 77.78719 60.04420 46.24410 34.41544 89.61585 26.52967
## 18 19 20 21 24 25
## 20.61534 62.01565 74.83003 28.50111 69.90142 78.77291
train_pred <- data.frame(train,pred)
train_pred## Hours Scores pred
## 1 2.5 21 26.52967
## 2 5.1 47 52.15843
## 3 3.2 27 33.42972
## 4 8.5 75 85.67296
## 5 3.5 30 36.38688
## 6 1.5 20 16.67245
## 7 9.2 88 92.57302
## 8 5.5 60 56.10132
## 9 8.3 81 83.70152
## 10 2.7 25 28.50111
## 11 7.7 85 77.78719
## 12 5.9 62 60.04420
## 13 4.5 41 46.24410
## 14 3.3 42 34.41544
## 16 8.9 95 89.61585
## 17 2.5 30 26.52967
## 18 1.9 24 20.61534
## 19 6.1 67 62.01565
## 20 7.4 69 74.83003
## 21 2.7 30 28.50111
## 24 6.9 76 69.90142
## 25 7.8 86 78.77291
Comparing actual values with Predicted values
Scores_comp <- data.frame(Actual = train$Scores, Predcited = train_pred$pred)
Scores_comp## Actual Predcited
## 1 21 26.52967
## 2 47 52.15843
## 3 27 33.42972
## 4 75 85.67296
## 5 30 36.38688
## 6 20 16.67245
## 7 88 92.57302
## 8 60 56.10132
## 9 81 83.70152
## 10 25 28.50111
## 11 85 77.78719
## 12 62 60.04420
## 13 41 46.24410
## 14 42 34.41544
## 15 95 89.61585
## 16 30 26.52967
## 17 24 20.61534
## 18 67 62.01565
## 19 69 74.83003
## 20 30 28.50111
## 21 76 69.90142
## 22 86 78.77291
Actual vs Predicted Graph
Actual <- c(Scores_comp$Actual)
Predcited <- c(Scores_comp$Predcited)
plot(Actual, type = "b", frame = FALSE, pch = 10,
col = "#4AC6B7", xlab = "x", ylab = "y")
lines(Predcited, pch = 10, col = "#C61951", type = "b", lty = 2)
legend("topleft", legend=c("Actual", "Predicted"), col=c("#4AC6B7", "#C61951"),
pch = 10, lty = 1:1, cex=0.6)Model Diagnostics
par(mfrow=c(2,2))
plot(model) ## Test data Evoluation
pred <- predict(model,test)
tibble(pred)## # A tibble: 3 x 1
## pred
## <dbl>
## 1 12.7
## 2 49.2
## 3 39.3
test_pred <- data.frame(test,pred)
test_pred## Hours Scores pred
## 15 1.1 17 12.72957
## 22 4.8 54 49.20127
## 23 3.8 35 39.34405
Actual vs Predicted graph
Scores_comp2 <- data.frame(Actual = test$Scores, Predcited = test_pred$pred)
Scores_comp2## Actual Predcited
## 1 17 12.72957
## 2 54 49.20127
## 3 35 39.34405
The predicted score if a student studies for 9.25 hrs/ day
- Predicting score if a student studies for 9.25 hrs/day
pred_hour <- predict(model, data.frame(Hours=9.25))
pred_hour## 1
## 93.06588
#(or)
coef(model)[1] + 9.25*coef(model)[2] #Mathematical form## (Intercept)
## 93.06588
library(highcharter)
hc <- train_pred_new %>% group_by(Scores, Hours) %>%
hchart('column', hcaes(x = 'Hours', y = 'Scores') )
hc